Let’s load the datasets.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.3.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::vars() masks ggplot2::vars()
# Load per-account metadata, keep only validated accounts, and lowercase the
# Twitter handle so it can be joined against the tweet data.
user_stats <- read_csv("./data/user-stats.csv") %>%
  filter(Valid == "Yes") %>%
  mutate(screen_name = tolower(`twitter account`)) %>%
  select(screen_name, LABEL) #remove userlevel, e.g., followersCount, favoritesCount, friendsCount
# NOTE: a trailing mutate_if(is.numeric, log) was removed — after the select()
# only character columns remain, so it was a no-op.
## Parsed with column specification:
## cols(
## .default = col_character(),
## statusesCount = col_integer(),
## followersCount = col_integer(),
## favoritesCount = col_integer(),
## friendsCount = col_integer(),
## created = col_datetime(format = ""),
## protected = col_logical(),
## verified = col_logical(),
## id = col_double(),
## listedCount = col_integer(),
## followRequestSent = col_logical()
## )
## See spec(...) for full column specifications.
# Load two weeks of tweets, normalize text and handles, flag each tweet as
# Train/Test by date, and attach the per-account labels.
t <- read_csv("./data/full-tweets-14days.csv") %>%
  mutate(
    Date = format(created_at, tz = "America/New_York"),
    text = iconv(text, to = "UTF-8"),
    screen_name = tolower(screen_name)
  ) %>%
  mutate(
    # ISO-formatted date strings compare correctly lexicographically
    DateFlag = case_when(
      Date < "2017-05-30" ~ "Train",
      TRUE ~ "Test"
    )
  ) %>%
  inner_join(user_stats, by = "screen_name")
## Parsed with column specification:
## cols(
## .default = col_character(),
## retweet_count = col_double(),
## favorited = col_logical(),
## truncated = col_logical(),
## id_str = col_double(),
## retweeted = col_logical(),
## created_at = col_datetime(format = ""),
## in_reply_to_status_id_str = col_double(),
## in_reply_to_user_id_str = col_double(),
## listed_count = col_double(),
## verified = col_logical(),
## user_id_str = col_double(),
## geo_enabled = col_logical(),
## user_created_at = col_datetime(format = ""),
## statuses_count = col_double(),
## followers_count = col_double(),
## favourites_count = col_double(),
## protected = col_logical(),
## utc_offset = col_double(),
## friends_count = col_double()
## )
## See spec(...) for full column specifications.
# Tweets per account across the two-week window
count <- count(t, screen_name, sort = TRUE)
# remove accounts with less than 40 tweets through the 2 weeks
active_accounts <- count$screen_name[count$n >= 40]
t <- filter(t, screen_name %in% active_accounts)
library(textfeatures)
## standardize function
## Standardize a numeric vector onto [0, 1] by dividing by its maximum
## (the minimum is pinned at 0 rather than taken from the data).
scale_standard <- function(x) x / max(x, na.rm = TRUE)
## convert to long (tidy) form and plot
## Per-account, per-period text features, joined back onto the account labels
p <- t %>%
  group_by(screen_name, DateFlag) %>%
  textfeatures() %>%
  inner_join(user_stats, by = "screen_name") %>%
  ungroup()
# Optional rescaling, currently disabled:
#%>% mutate_if(is.numeric, scale_standard)
# Class balance of the account labels
table(p$LABEL)
##
## clickbait hoax propaganda realnews satire
## 62 6 156 62 18
# Binary target: 1 = real news, 0 = everything else (clickbait, hoax,
# propaganda, satire). case_when keeps any NA label in the 0 class.
p <- p %>%
  mutate(
    y = as.factor(case_when(
      LABEL == "realnews" ~ 1L,
      TRUE ~ 0L
    ))
  ) %>%
  select(-LABEL)
# Split up the data set by period and drop the identifier columns the
# model must never see.
set.seed(3456)
train <- filter(p, DateFlag == "Train") %>% select(-screen_name, -DateFlag)
test <- filter(p, DateFlag == "Test") %>% select(-screen_name, -DateFlag)
# Refer to the response column by name instead of the fragile hard-coded
# column index 18, which silently breaks if the feature set changes.
model <- randomForest::randomForest(
  y ~ ., data = train,
  xtest = select(test, -y), ytest = test$y,
  localImp = TRUE
)
model
##
## Call:
## randomForest(formula = y ~ ., data = train, xtest = test[, -18], ytest = test$y, localImp = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 9.21%
## Confusion matrix:
## 0 1 class.error
## 0 116 5 0.04132231
## 1 9 22 0.29032258
## Test set error rate: 3.29%
## Confusion matrix:
## 0 1 class.error
## 0 120 1 0.008264463
## 1 4 27 0.129032258
# Reference: https://mi2datalab.github.io/randomForestExplainer/
library(randomForestExplainer)
# rerun without test for rfForest
# (refit on the training data only, without xtest/ytest, before running the
# randomForestExplainer helpers below)
model <- randomForest::randomForest(y ~ ., data = train, localImp = TRUE)
# Distribution of each variable's minimal depth across the forest's trees
min_depth_frame <- min_depth_distribution(model)
plot_min_depth_distribution(min_depth_frame)
# Multi-measure variable importance (accuracy decrease, Gini decrease,
# node/tree counts, p-values), rendered as a table
importance_frame <- measure_importance(model)
knitr::kable(importance_frame)
| variable | mean_min_depth | no_of_nodes | accuracy_decrease | gini_decrease | no_of_trees | times_a_root | p_value |
|---|---|---|---|---|---|---|---|
| n_caps | 2.606790 | 404 | 0.0190791 | 4.2742470 | 282 | 64 | 0.0000045 |
| n_capsp | 1.867438 | 465 | 0.0435479 | 8.4636211 | 324 | 114 | 0.0000000 |
| n_chars | 3.673252 | 253 | 0.0040539 | 1.5851752 | 200 | 19 | 0.9999870 |
| n_charsperword | 3.958493 | 229 | 0.0036038 | 1.2966192 | 183 | 14 | 1.0000000 |
| n_commas | 3.651820 | 268 | 0.0051611 | 1.5784511 | 210 | 19 | 0.9994925 |
| n_digits | 3.617910 | 293 | 0.0011254 | 1.2254618 | 233 | 0 | 0.9630694 |
| n_exclaims | 1.968170 | 533 | 0.0250244 | 5.2892613 | 377 | 55 | 0.0000000 |
| n_extraspaces | 3.490928 | 324 | 0.0046426 | 1.8361239 | 242 | 6 | 0.4932735 |
| n_hashtags | 3.128891 | 384 | 0.0167857 | 3.1999076 | 278 | 11 | 0.0003856 |
| n_lowers | 2.658716 | 367 | 0.0169866 | 4.4457087 | 260 | 74 | 0.0074937 |
| n_lowersp | 1.954313 | 469 | 0.0379129 | 7.4267165 | 330 | 98 | 0.0000000 |
| n_mentions | 3.658568 | 275 | 0.0051994 | 1.6103046 | 217 | 2 | 0.9978979 |
| n_nonasciis | 3.895066 | 235 | 0.0004246 | 0.8781595 | 197 | 1 | 1.0000000 |
| n_periods | 3.872637 | 260 | 0.0025483 | 1.2706910 | 204 | 7 | 0.9999200 |
| n_puncts | 3.762780 | 270 | 0.0041782 | 1.6757341 | 206 | 0 | 0.9992242 |
| n_urls | 4.188138 | 197 | 0.0004920 | 0.8991808 | 161 | 2 | 1.0000000 |
| n_words | 3.605517 | 271 | 0.0069132 | 1.7665049 | 217 | 14 | 0.9990461 |
# Bar chart of permutation importance. labs() labels aesthetics, not screen
# positions: the x aesthetic is the feature and the y aesthetic is the
# accuracy decrease, so the original x/y labels were swapped.
importance_frame %>%
  mutate(variable = as.factor(variable)) %>%
  ggplot(aes(x = forcats::fct_reorder(variable, accuracy_decrease), y = accuracy_decrease)) +
  geom_col() +
  coord_flip() +
  labs(title = "Feature Importance: Decrease in Accuracy",
       x = "Feature",
       y = "Decrease in Accuracy after Removing Feature")
# The nine strongest features by permutation accuracy decrease, as a
# one-column data frame of character names.
top9 <- importance_frame %>%
  arrange(desc(accuracy_decrease)) %>%
  slice(1:9) %>%
  select(variable) %>%
  mutate(variable = as.character(variable))
# Compare the distribution of each top-9 feature between real-news (y == 1)
# and suspicious accounts, on the training split.
train %>%
  select(c(top9$variable, "y")) %>%
  gather(metric, value, -y) %>%
  mutate(Type = ifelse(y==1L,"Real News","Suspicious")) %>%
  ggplot(aes(x = value, fill = Type)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~metric, scales = "free")
# Interactive jitter plot of the two strongest features, with the account
# handle shown on hover via plotly.
z_dat <- p %>%
  filter(DateFlag == "Train") %>%
  select(screen_name, n_capsp, n_exclaims, y) %>%
  gather(metric, value, -screen_name, -y) %>%
  mutate(Type = ifelse(y == 1L, "Real News", "Suspicious"))
z <- ggplot(z_dat, aes(x = metric, y = value, color = Type, text = screen_name)) +
  geom_jitter(height = 0, width = 0.2) +
  coord_flip()
plotly::ggplotly(z, tooltip = c("x", "y", "text"))
# Inspect raw tweets from three example accounts, one tibble per account.
# (head(n = 10) on the 3-element list is effectively a no-op here.)
t %>%
  filter(screen_name %in% c("govtslaves", "aff_patriots", "henrymakow")) %>%
  select(text, created_at, screen_name) %>%
  split(.$screen_name) %>%
  head(n = 10)
## $aff_patriots
## # A tibble: 182 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 BREAKING: James Bond Star DEAD - https… 2017-05-23 14:46:38 aff_patrio…
## 2 BREAKING: SHOTS FIRED! - https://t.co/… 2017-05-23 16:14:28 aff_patrio…
## 3 BREAKING: Terrifying Details EMERGE Ab… 2017-05-23 17:19:19 aff_patrio…
## 4 BREAKING: Mad Dog Mattis Just Sent N. … 2017-05-23 17:28:09 aff_patrio…
## 5 Internet Absolutely ERUPTS After Every… 2017-05-23 18:04:33 aff_patrio…
## 6 CNN Goes TOO Far With What They Just D… 2017-05-23 19:02:21 aff_patrio…
## 7 BREAKING: America Just Launched MASSIV… 2017-05-23 20:28:53 aff_patrio…
## 8 Multiple DEAD After Suicide Bombing AT… 2017-05-23 20:27:18 aff_patrio…
## 9 BREAKING: ISIS Just EXPANDED To This C… 2017-05-23 21:18:35 aff_patrio…
## 10 BREAKING: New York On HIGH ALERT - htt… 2017-05-23 22:17:36 aff_patrio…
## # ... with 172 more rows
##
## $govtslaves
## # A tibble: 520 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 HOW WILL BREXIT AFFECT THE UK ONLINE G… 2017-05-23 11:34:59 govtslaves
## 2 HUGE BITCOIN CORRECTION NOW IMMINENT A… 2017-05-23 16:02:04 govtslaves
## 3 JUDGE: IT’S OK IF BEST BUY’S GEEK SQUA… 2017-05-23 16:06:46 govtslaves
## 4 OVER THE LAST 10 YEARS THE U.S. ECONOM… 2017-05-23 16:25:30 govtslaves
## 5 JOE LIEBERMAN ATOP FBI WOULD BE A FIRS… 2017-05-23 17:22:58 govtslaves
## 6 TRUMP’S SON-IN-LAW IS A BALTIMORE SLUM… 2017-05-23 19:52:35 govtslaves
## 7 BLACK LIVES MATTER AWARDED 2017 SYDNEY… 2017-05-23 22:13:31 govtslaves
## 8 GOOGLE STARTS TRACKING OFFLINE SHOPPIN… 2017-05-23 22:20:57 govtslaves
## 9 THE BOMBINGS HAVE ONLY JUST BEGUN: “PR… 2017-05-23 22:30:42 govtslaves
## 10 PAUL PLANS TO FORCE VOTE ON $110B SAUD… 2017-05-24 14:31:01 govtslaves
## # ... with 510 more rows
##
## $henrymakow
## # A tibble: 529 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 "RT @DrDavidDuke: 'The Rise Of The Goo… 2017-05-23 07:29:46 henrymakow
## 2 "Netflix, Feminist suppress film that … 2017-05-23 07:24:08 henrymakow
## 3 See the Masonic checkerboard on their … 2017-05-23 07:27:28 henrymakow
## 4 This will continue until we all realiz… 2017-05-23 07:30:52 henrymakow
## 5 RT @CollegeFix: Professor who called d… 2017-05-23 07:28:06 henrymakow
## 6 Making Israel Great Again https://t.co… 2017-05-23 07:34:40 henrymakow
## 7 "Assange accuser connected to CIA\nhtt… 2017-05-23 12:26:00 henrymakow
## 8 "A new recruit.\nhttps://t.co/xV2H53BY… 2017-05-23 12:28:18 henrymakow
## 9 "Whistleblower- Surgeon says Rich was … 2017-05-23 12:24:30 henrymakow
## 10 "Democrat operative ran interference i… 2017-05-23 12:31:30 henrymakow
## # ... with 519 more rows
# Inspect exclamation-heavy accounts: keep only tweets containing a '!',
# grouped into one tibble per account.
t %>%
  filter(screen_name %in% c("unhealthytruth", "100percfedup", "twitchyteam")) %>%
  select(text, created_at, screen_name) %>%
  filter(grepl('!', text)) %>% # keep only those with !'s
  split(.$screen_name) %>%
  head(n = 10)
## $`100percfedup`
## # A tibble: 231 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 "WATCH: Brave Christian Brits Attempt … 2017-05-23 04:17:07 100percfed…
## 2 WOW! HOLLYWOOD CONSERVATIVE Stands Up … 2017-05-23 07:00:03 100percfed…
## 3 New post: UNREAL! THE LEFT OUTRAGED AT… 2017-05-23 17:11:36 100percfed…
## 4 "UNREAL! THE LEFT OUTRAGED AT TRUMP'S … 2017-05-23 18:00:04 100percfed…
## 5 New post: DINGBAT NANCY STRIKES AGAIN!… 2017-05-23 18:14:16 100percfed…
## 6 RT @pat22372: NANCY STRIKES AGAIN! Wat… 2017-05-23 18:29:24 100percfed…
## 7 DINGBAT NANCY STRIKES AGAIN! Watch Nan… 2017-05-23 18:58:45 100percfed…
## 8 RT @seanhannity: IMPORTANT! Mediamatte… 2017-05-23 23:07:10 100percfed…
## 9 New post: BOOM! TREY GOWDY Hammers Ex-… 2017-05-23 23:26:46 100percfed…
## 10 "BOOM! TREY GOWDY Hammers Ex-CIA Chief… 2017-05-24 00:00:04 100percfed…
## # ... with 221 more rows
##
## $twitchyteam
## # A tibble: 171 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 Wait, WHAT!? Wife of Bernie Sanders ha… 2017-05-23 04:25:31 twitchyteam
## 2 Cowards! AP censors ITSELF in weak hea… 2017-05-23 13:42:02 twitchyteam
## 3 ‘Look who now supports voter ID!’: Dem… 2017-05-23 15:07:10 twitchyteam
## 4 ICYMI ==> Wait, WHAT!? Wife of Bern… 2017-05-23 15:19:06 twitchyteam
## 5 Move over InfoWars, here comes CNN! An… 2017-05-23 15:44:19 twitchyteam
## 6 ‘How were you RAISED?!’ Dana Loesch bl… 2017-05-23 16:17:47 twitchyteam
## 7 DAAAAAMN! John Kincade levels scumbag … 2017-05-23 16:38:59 twitchyteam
## 8 ‘Go BACK to the woods!’ Hillary’s Manc… 2017-05-23 18:38:22 twitchyteam
## 9 Cowards! AP censors ITSELF in weak hea… 2017-05-23 19:06:41 twitchyteam
## 10 ‘No sh*t, Sherlocks’! Prepare to be st… 2017-05-23 20:31:07 twitchyteam
## # ... with 161 more rows
##
## $unhealthytruth
## # A tibble: 67 x 3
## text created_at screen_name
## <chr> <dttm> <chr>
## 1 Huge victory!! https://t.co/4lScqdQGAP 2017-05-23 19:00:04 unhealthyt…
## 2 This is HUGE, people, and making mains… 2017-05-23 22:59:20 unhealthyt…
## 3 We are being censored. Watch for yours… 2017-05-24 00:13:15 unhealthyt…
## 4 I still cannot BELIEVE CNN did this st… 2017-05-24 02:38:28 unhealthyt…
## 5 Check it out! https://t.co/M6U0VoOqbW 2017-05-24 08:29:02 unhealthyt…
## 6 Good news! https://t.co/F6uxdiAXh6 2017-05-24 11:45:02 unhealthyt…
## 7 Good to know! https://t.co/wfRtev7Uhn 2017-05-24 22:29:04 unhealthyt…
## 8 Wow! : ) https://t.co/TjnkJGhCL0 2017-05-25 01:46:02 unhealthyt…
## 9 Check it out! https://t.co/fJTrFm10Bx 2017-05-25 12:07:03 unhealthyt…
## 10 Absolutely!! https://t.co/DrXdtrRFL1 2017-05-25 14:00:05 unhealthyt…
## # ... with 57 more rows
# library(lime); library(caret)
# trainIndex <- createDataPartition(p$y, p = .8,
# list = FALSE,
# times = 1)
#
# x_train <- p[trainIndex,] %>% select(-y, -screen_name)
# x_test <- p[-trainIndex,] %>% select(-y, -screen_name)
#
# y_train <- p$y[trainIndex]
# y_test <- p$y[-trainIndex]
#
# # Create Random Forest model on iris data
# model <- train(x_train, y_train, method = 'rf', localImp = TRUE)
#
# train <- p[trainIndex,] %>% select(-screen_name)
# test <- p[-trainIndex,] %>% select(-screen_name)
# # Create an explainer object
# explainer <- lime(x_train, model)
#
# # Explain new observation
# explanation <- explain(x_test, explainer, n_labels = 1, n_features = 2)
#
# plot_features(explanation)
n_chars = n_charS(text2),
n_commas = n_commas(text2),
n_digits = n_digits(text2),
n_exclaims = n_exclaims(text2),
n_extraspaces = n_extraspaces(text2),
n_hashtags = n_hashtags(text),
n_lowers = n_lowers(text2),
n_lowersp = (n_lowers + 1L) / (n_chars + 1L),
n_mentions = n_mentions(text),
n_periods = n_periods(text2),
n_urls = n_urls(text),
n_words = n_words(text2),
n_caps = n_caps(text2),
n_nonasciis = n_nonasciis(text2),
n_puncts = n_puncts(text2),
n_capsp = (n_caps + 1L) / (n_chars + 1L),
n_charsperword = (n_chars + 1L) / (n_words + 1L)
# Count whitespace-separated words per element, ignoring digits.
# NA elements stay NA; an all-NA input returns scalar 0.
n_words <- function(x) {
  missing <- is.na(x)
  if (all(missing)) {
    return(0)
  }
  without_digits <- gsub("\\d", "", x)
  counts <- lengths(strsplit(without_digits, "\\s+"))
  counts[missing] <- NA_integer_
  counts
}
# Count non-whitespace characters per element.
# NA elements stay NA; an all-NA input returns scalar 0.
n_charS <- function(x) {
  missing <- is.na(x)
  if (all(missing)) {
    return(0)
  }
  counts <- nchar(gsub("\\s", "", x))
  counts[missing] <- NA_integer_
  counts
}
# Count digit characters per element (everything non-digit is stripped first).
# NA elements stay NA; an all-NA input returns scalar 0.
n_digits <- function(x) {
  missing <- is.na(x)
  if (all(missing)) {
    return(0)
  }
  counts <- nchar(gsub("\\D", "", x))
  counts[missing] <- NA_integer_
  counts
}
# Count hashtags (#token runs) per element; NA in -> NA out, all-NA -> 0.
# Uses base vapply instead of vply_int, an unexported textfeatures internal
# that is not defined in this file.
n_hashtags <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("#\\S+", x)
  # gregexpr() yields -1 when there is no match, so count positive starts only
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count @mentions (@token runs) per element; NA in -> NA out, all-NA -> 0.
# Uses base vapply instead of vply_int, an unexported textfeatures internal
# that is not defined in this file.
n_mentions <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("@\\S+", x)
  # gregexpr() yields -1 when there is no match, so count positive starts only
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count runs of consecutive commas per element (",," counts once);
# NA in -> NA out, all-NA -> 0. Uses base vapply instead of vply_int,
# an unexported textfeatures internal not defined in this file.
n_commas <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr(",+", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count runs of consecutive periods per element ("..." counts once);
# NA in -> NA out, all-NA -> 0. Uses base vapply instead of vply_int,
# an unexported textfeatures internal not defined in this file.
n_periods <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\.+", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count runs of consecutive exclamation marks per element ("!!" counts once);
# NA in -> NA out, all-NA -> 0. Uses base vapply instead of vply_int,
# an unexported textfeatures internal not defined in this file.
n_exclaims <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\!+", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count occurrences of "extra" whitespace: runs of 2+ spaces, tabs, or
# newlines. NA in -> NA out, all-NA -> 0. Uses base vapply instead of
# vply_int, an unexported textfeatures internal not defined in this file.
n_extraspaces <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("\\s{2,}|\\t|\\n", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count uppercase letters per element; NA in -> NA out, all-NA -> 0.
# Uses base vapply instead of vply_int, an unexported textfeatures internal
# that is not defined in this file.
n_caps <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("[[:upper:]]", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count lowercase letters per element; NA in -> NA out, all-NA -> 0.
# Uses base vapply instead of vply_int, an unexported textfeatures internal
# that is not defined in this file.
n_lowers <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("[[:lower:]]", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count URL scheme prefixes ("http:" or "https:") per element;
# NA in -> NA out, all-NA -> 0. Uses base vapply instead of vply_int,
# an unexported textfeatures internal not defined in this file.
n_urls <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  m <- gregexpr("https?:", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count non-ASCII characters per element by substituting each with a sentinel
# token and counting the tokens; NA in -> NA out, all-NA -> 0. Uses base
# vapply instead of vply_int, an unexported textfeatures internal not defined
# in this file. (A literal "[NONASCII]" already in the text would also be
# counted — accepted limitation of the sentinel approach.)
n_nonasciis <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  x <- iconv(x, from = "UTF-8", to = "ASCII", sub = "[NONASCII]")
  m <- gregexpr("\\[NONASCII\\]", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}
# Count punctuation characters per element, excluding '!', '.', and ',' —
# those are counted by n_exclaims / n_periods / n_commas, so they are
# stripped first. NA in -> NA out, all-NA -> 0. Uses base vapply instead of
# vply_int, an unexported textfeatures internal not defined in this file.
n_puncts <- function(x) {
  na <- is.na(x)
  if (all(na)) return(0)
  x <- gsub("!|\\.|\\,", "", x)
  m <- gregexpr("[[:punct:]]", x)
  x <- vapply(m, function(pos) sum(pos > 0, na.rm = TRUE), integer(1))
  x[na] <- NA_integer_
  x
}